{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocess MovieLens-20M "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import datetime\n",
    "import json\n",
    "import os\n",
    "import time\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "import matplotlib\n",
    "matplotlib.use('Agg')\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "import pandas as pd\n",
    "import scipy.sparse\n",
    "\n",
    "import seaborn as sns\n",
    "sns.set(context=\"paper\", font_scale=1.5, rc={\"lines.linewidth\": 2}, font='DejaVu Serif')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "DATA_DIR = '/hdd2/dawen/data/ml-20m/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def timestamp_to_date(timestamp):\n",
    "    return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# header=0: the first row of ratings.csv supplies the column names\n",
    "raw_data = pd.read_csv(\n",
    "    os.path.join(DATA_DIR, 'ratings.csv'),\n",
    "    header=0\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# binarize the data: a rating counts as positive only when it is above 3.5\n",
    "# (i.e. only keep ratings >= 4)\n",
    "raw_data = raw_data.loc[raw_data['rating'] > 3.5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# sort the raw data according to timestamp\n",
    "# (sort_values is the supported spelling; sort_index(by=...) was deprecated\n",
    "# and is no longer accepted by current pandas)\n",
    "raw_data = raw_data.sort_values(by=['timestamp'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4182421</th>\n",
       "      <td>28507</td>\n",
       "      <td>1176</td>\n",
       "      <td>4.0</td>\n",
       "      <td>789652004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18950936</th>\n",
       "      <td>131160</td>\n",
       "      <td>47</td>\n",
       "      <td>5.0</td>\n",
       "      <td>789652009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15688196</th>\n",
       "      <td>108467</td>\n",
       "      <td>57</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12341186</th>\n",
       "      <td>85252</td>\n",
       "      <td>70</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452501</th>\n",
       "      <td>99851</td>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452517</th>\n",
       "      <td>99851</td>\n",
       "      <td>58</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452516</th>\n",
       "      <td>99851</td>\n",
       "      <td>55</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452515</th>\n",
       "      <td>99851</td>\n",
       "      <td>52</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452514</th>\n",
       "      <td>99851</td>\n",
       "      <td>50</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452513</th>\n",
       "      <td>99851</td>\n",
       "      <td>47</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452512</th>\n",
       "      <td>99851</td>\n",
       "      <td>45</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452509</th>\n",
       "      <td>99851</td>\n",
       "      <td>39</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452507</th>\n",
       "      <td>99851</td>\n",
       "      <td>32</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452506</th>\n",
       "      <td>99851</td>\n",
       "      <td>31</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452505</th>\n",
       "      <td>99851</td>\n",
       "      <td>21</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3056639</th>\n",
       "      <td>20821</td>\n",
       "      <td>32</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15688194</th>\n",
       "      <td>108467</td>\n",
       "      <td>11</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452504</th>\n",
       "      <td>99851</td>\n",
       "      <td>19</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452503</th>\n",
       "      <td>99851</td>\n",
       "      <td>18</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12341184</th>\n",
       "      <td>85252</td>\n",
       "      <td>60</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14452502</th>\n",
       "      <td>99851</td>\n",
       "      <td>10</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12341181</th>\n",
       "      <td>85252</td>\n",
       "      <td>50</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19424622</th>\n",
       "      <td>134445</td>\n",
       "      <td>11</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19424624</th>\n",
       "      <td>134445</td>\n",
       "      <td>21</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19424626</th>\n",
       "      <td>134445</td>\n",
       "      <td>45</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19424627</th>\n",
       "      <td>134445</td>\n",
       "      <td>58</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12341159</th>\n",
       "      <td>85252</td>\n",
       "      <td>2</td>\n",
       "      <td>4.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12341161</th>\n",
       "      <td>85252</td>\n",
       "      <td>7</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12341162</th>\n",
       "      <td>85252</td>\n",
       "      <td>10</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12341165</th>\n",
       "      <td>85252</td>\n",
       "      <td>17</td>\n",
       "      <td>5.0</td>\n",
       "      <td>822873600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19742824</th>\n",
       "      <td>136690</td>\n",
       "      <td>48394</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427775557</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19742805</th>\n",
       "      <td>136690</td>\n",
       "      <td>1136</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427775558</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19742831</th>\n",
       "      <td>136690</td>\n",
       "      <td>104841</td>\n",
       "      <td>4.5</td>\n",
       "      <td>1427775561</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480802</th>\n",
       "      <td>107073</td>\n",
       "      <td>745</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427776814</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480822</th>\n",
       "      <td>107073</td>\n",
       "      <td>5971</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427776816</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480805</th>\n",
       "      <td>107073</td>\n",
       "      <td>1148</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1427776833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480828</th>\n",
       "      <td>107073</td>\n",
       "      <td>92259</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1427776892</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480823</th>\n",
       "      <td>107073</td>\n",
       "      <td>6016</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427777118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480804</th>\n",
       "      <td>107073</td>\n",
       "      <td>858</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427777123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480826</th>\n",
       "      <td>107073</td>\n",
       "      <td>58559</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1427777129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480819</th>\n",
       "      <td>107073</td>\n",
       "      <td>4993</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427777155</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480821</th>\n",
       "      <td>107073</td>\n",
       "      <td>5952</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427777157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480818</th>\n",
       "      <td>107073</td>\n",
       "      <td>4306</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427777158</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480825</th>\n",
       "      <td>107073</td>\n",
       "      <td>7153</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427777166</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480816</th>\n",
       "      <td>107073</td>\n",
       "      <td>3793</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1427777169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15480798</th>\n",
       "      <td>107073</td>\n",
       "      <td>527</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427777203</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17877748</th>\n",
       "      <td>123613</td>\n",
       "      <td>109243</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1427779965</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378451</th>\n",
       "      <td>57814</td>\n",
       "      <td>7361</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1427780465</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378452</th>\n",
       "      <td>57814</td>\n",
       "      <td>7438</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427780468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378505</th>\n",
       "      <td>57814</td>\n",
       "      <td>108979</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427780517</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378409</th>\n",
       "      <td>57814</td>\n",
       "      <td>1527</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1427780519</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378407</th>\n",
       "      <td>57814</td>\n",
       "      <td>1274</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427780571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378413</th>\n",
       "      <td>57814</td>\n",
       "      <td>1748</td>\n",
       "      <td>4.5</td>\n",
       "      <td>1427780617</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378443</th>\n",
       "      <td>57814</td>\n",
       "      <td>6283</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427780623</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378393</th>\n",
       "      <td>57814</td>\n",
       "      <td>924</td>\n",
       "      <td>4.5</td>\n",
       "      <td>1427780631</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378423</th>\n",
       "      <td>57814</td>\n",
       "      <td>3527</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1427780657</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378468</th>\n",
       "      <td>57814</td>\n",
       "      <td>48774</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1427780663</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378405</th>\n",
       "      <td>57814</td>\n",
       "      <td>1240</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1427781001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8378415</th>\n",
       "      <td>57814</td>\n",
       "      <td>2311</td>\n",
       "      <td>4.5</td>\n",
       "      <td>1427781083</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12898527</th>\n",
       "      <td>89081</td>\n",
       "      <td>52458</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1427782288</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>9995410 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          userId  movieId  rating   timestamp\n",
       "4182421    28507     1176     4.0   789652004\n",
       "18950936  131160       47     5.0   789652009\n",
       "15688196  108467       57     4.0   822873600\n",
       "12341186   85252       70     4.0   822873600\n",
       "14452501   99851        1     4.0   822873600\n",
       "14452517   99851       58     5.0   822873600\n",
       "14452516   99851       55     4.0   822873600\n",
       "14452515   99851       52     4.0   822873600\n",
       "14452514   99851       50     5.0   822873600\n",
       "14452513   99851       47     5.0   822873600\n",
       "14452512   99851       45     4.0   822873600\n",
       "14452509   99851       39     5.0   822873600\n",
       "14452507   99851       32     5.0   822873600\n",
       "14452506   99851       31     5.0   822873600\n",
       "14452505   99851       21     5.0   822873600\n",
       "3056639    20821       32     5.0   822873600\n",
       "15688194  108467       11     4.0   822873600\n",
       "14452504   99851       19     4.0   822873600\n",
       "14452503   99851       18     4.0   822873600\n",
       "12341184   85252       60     4.0   822873600\n",
       "14452502   99851       10     4.0   822873600\n",
       "12341181   85252       50     5.0   822873600\n",
       "19424622  134445       11     4.0   822873600\n",
       "19424624  134445       21     5.0   822873600\n",
       "19424626  134445       45     5.0   822873600\n",
       "19424627  134445       58     5.0   822873600\n",
       "12341159   85252        2     4.0   822873600\n",
       "12341161   85252        7     5.0   822873600\n",
       "12341162   85252       10     5.0   822873600\n",
       "12341165   85252       17     5.0   822873600\n",
       "...          ...      ...     ...         ...\n",
       "19742824  136690    48394     5.0  1427775557\n",
       "19742805  136690     1136     5.0  1427775558\n",
       "19742831  136690   104841     4.5  1427775561\n",
       "15480802  107073      745     5.0  1427776814\n",
       "15480822  107073     5971     5.0  1427776816\n",
       "15480805  107073     1148     4.0  1427776833\n",
       "15480828  107073    92259     4.0  1427776892\n",
       "15480823  107073     6016     5.0  1427777118\n",
       "15480804  107073      858     5.0  1427777123\n",
       "15480826  107073    58559     4.0  1427777129\n",
       "15480819  107073     4993     5.0  1427777155\n",
       "15480821  107073     5952     5.0  1427777157\n",
       "15480818  107073     4306     5.0  1427777158\n",
       "15480825  107073     7153     5.0  1427777166\n",
       "15480816  107073     3793     4.0  1427777169\n",
       "15480798  107073      527     5.0  1427777203\n",
       "17877748  123613   109243     4.0  1427779965\n",
       "8378451    57814     7361     4.0  1427780465\n",
       "8378452    57814     7438     5.0  1427780468\n",
       "8378505    57814   108979     5.0  1427780517\n",
       "8378409    57814     1527     4.0  1427780519\n",
       "8378407    57814     1274     5.0  1427780571\n",
       "8378413    57814     1748     4.5  1427780617\n",
       "8378443    57814     6283     5.0  1427780623\n",
       "8378393    57814      924     4.5  1427780631\n",
       "8378423    57814     3527     4.0  1427780657\n",
       "8378468    57814    48774     4.0  1427780663\n",
       "8378405    57814     1240     5.0  1427781001\n",
       "8378415    57814     2311     4.5  1427781083\n",
       "12898527   89081    52458     4.0  1427782288\n",
       "\n",
       "[9995410 rows x 4 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# copy the timestamp column into a standalone NumPy array\n",
    "tstamp = np.array(raw_data['timestamp'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Time span of the dataset: From 1995-01-09 06:46:44 to 2015-03-31 02:11:28\n"
     ]
    }
   ],
   "source": [
    "# report the earliest and latest rating times in human-readable form\n",
    "print(\"Time span of the dataset: From %s to %s\" %\n",
    "      tuple(map(timestamp_to_date, (tstamp.min(), tstamp.max()))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# apparently the timestamps are ordered, check to make sure\n",
    "\n",
    "# Compare every element with its successor in one vectorized pass. This avoids\n",
    "# the Python-2-only xrange and a Python-level loop over every rating.\n",
    "# Prints once if any adjacent pair is out of order; prints nothing otherwise.\n",
    "if np.any(tstamp[:-1] > tstamp[1:]):\n",
    "    print(\"not ordered\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Confirmed the timestamps are ordered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaYAAAFBCAYAAAA8MAs5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X9UVXW++P/n4fBDFDgcQCVL8gdXEKS6lTHWoNaMjeOF\nhOIkWpndyiY1SrolzvoMydymiFlDpiVTt6zJEqPDMkcs53pthDXDJNc7NaMDaHARyYwKDhws5Yhn\nf//wy74i5wACh73hvB5rsZbs137v135v8LzY7/3eexsURVEQQgghdMJH6x0QQgghLiaFSQghhK5I\nYRJCCKErUpiEEELoihQmIYQQuiKFSQghhK749rXCD3/4Q6ZNmwaAoijYbDaioqLYsmULANXV1eTm\n5uLj44PJZCIvLw+TyaS237p1K6WlpRgMBhYtWsSDDz6oxtra2sjOzqatrQ2n00lOTg5xcXFqvKKi\ngoKCAnx9fZk6dSq5ubn4+/ur8fz8fCorK1EUheXLl7N48eLBHxEhhBDaUvqQm5vb7ftnn31W2blz\np6IoiuJwOJRbb71VqaysVBRFUTZt2qRkZmaq65aVlSk//elPFYfDoXR0dCgLFy5UDhw4oMbXrl2r\nbN68WVEURfnkk0+UefPmKQ6HQ1EURWlublYSExOVhoYGRVEUZd26dcoLL7ygtt2+fbty//33K4qi\nKC0tLcrNN9+sHD16tK/uCCGE0Lk+h/JycnLUfzudTv74xz9y++23A1BeXo7RaGT27NkAWCwW9u3b\nh81mA6C4uJjk5GT8/Pzw9/cnJSWFHTt2ANDa2srevXtJT08HIDExET8/Pw4cOABAaWkpsbGxREVF\nAZCeno7VakX5/+8HLi4uJi0tDQCz2cz8+fMpLi4edKEWQgihrcu6xlRRUcF1113H2LFjATh8+LA6\nzAcQGRnJmDFjqKqqchmPjo7myJEjAFRVVREQEEBkZKQanz59uhp31dZut9PQ0IDD4eDo0aNuty2E\nEGLkuqzCtHv37m7XcZqbmwkKCuq2TkhICM3NzWo8ODhYjQUHB9PS0jKgtiEhIepym82G0+l0u20h\nhBAjV78L09mzZ/nss8/44Q9/2G25wWDosa5y0eP3XMX729aV3rbdV1shhBD61+esvC779+9n/vz5\n3YpBWFgYNTU13daz2+1ERESocbvdrsba29sxm80AhIeH097e3qPtjBkz1LYXx7u2ExERgdlsxsfH\np8e2w8PDe+3DmTMOfH2N/e3ysDMaDZw/r01x1TK3t+eXvntn37XOr3XfAfz8XH8e97sw7d69m8zM\nzG7LEhIS+Oijj9TvT506RUdHB/Hx8Wq8vr5ejdfW1pKQkABAXFwcDoeDpqYmJk6cCEBdXZ06GSIh\nIUGdCNHV1mQyERUVhY+PDzExMdTX16vbq62tZdasWb324fTpjv52VxOhoWNpbf3e63J7e37pu3f2\nXev8WvcdYPz4YJfL+zWUZ7PZ+Oqrr7rdYwQwd+5cOjs7OXToEAAlJSUsWLCA0NBQADIyMtizZw8O\nh4OOjg5KS0tZunQpAKGhoSxcuBCr1QpAZWUlnZ2dzJ07F4CUlBRqamo4ceKEum2LxYKPz4VdXrJk\nCTt37lT3r6ysDIvF0v8jIoQQQpf6dcb00UcfsWjRoh7L/f392bJlCxs2bMBoNBISEkJeXp4aT0pK\noq6ujoyMDAwGAxaLRS08cGEqenZ2NsuWLcPpdFJYWKjeQBsWFkZBQQFZWVn4+voyZcqUbmdsGRkZ\nNDY2kp6ejqIorFu3jpiYmAEfCCGEEPpgULxoxsA337T3vZKGvPm03pvzS9+9s+9a59e67zDIoTwh\nhBBiuEhhEkIIoStSmIQQQuiKFCYhhBC6IoVJCCGErkhhEkIIoStSmIQQQuiKFCYhhBC6IoVJCCGE\nrkhhEkIIoStSmIQQQuiKFCYhhBC6Io
VJCCGErkhhEkIIoStSmIQQQuiKFCYhhBC6IoVJCCGErkhh\nEkIIoStSmIQQQuiKFCYhhBC6IoVJCCGErvhqvQNCuOJ0Ojl58guXsSuvvAofH/mbSojRSgqT0KWT\nJ7/gsfzdjAkO77b8bHszm59OYfLkKI32TAjhaVKYhG6NCQ5nbMgErXdDCDHM+l2Y3n//fUpKSjAa\njZw+fZpf/OIX3HjjjVRXV5Obm4uPjw8mk4m8vDxMJpPabuvWrZSWlmIwGFi0aBEPPvigGmtrayM7\nO5u2tjacTic5OTnExcWp8YqKCgoKCvD19WXq1Knk5ubi7++vxvPz86msrERRFJYvX87ixYsHezyE\nEEJorF8D9R999BH79+9n+/btvPvuuzzwwAN88803nDt3jtWrV/Pkk0+yfft24uLiyMnJUduVl5dj\ntVp57733KCoqwmq1UlZWpsZzc3OJj49n+/btrF27llWrVnHu3DkAWlpayMrKoqCggB07dqAoChs3\nblTbFhUVUVVVhdVq5fXXXyc/P59jx44N1XERQgihkX4Vpi1btvDII4+oF5xTU1P56U9/Snl5OUaj\nkdmzZwNgsVjYt28fNpsNgOLiYpKTk/Hz88Pf35+UlBR27NgBQGtrK3v37iU9PR2AxMRE/Pz8OHDg\nAAClpaXExsYSFXXhWkJ6ejpWqxVFUdRtp6WlAWA2m5k/fz7FxcVDcUyEEEJoqM/C1NLSwueff87x\n48e5//77uffeeykqKgLg8OHDTJs2TV03MjKSMWPGUFVV5TIeHR3NkSNHAKiqqiIgIIDIyEg1Pn36\ndDXuqq3dbqehoQGHw8HRo0fdblsIIcTI1ec1pi++uDBl98MPP2Tr1q20tbWRnp6OyWSiubmZoKCg\nbuuHhITQ3NwMQHNzM8HBwWosODiYlpYWNdZX20mTJnWLdS0PDAzE6XS63bYQQoiRq88zJofDAUBG\nRgZGo5GwsDDuuOMOSkpKMBgMGAyGHm26htsAl/HeYhe3daW3bffVVgghhP71ecbUNcMuPPz/7ieZ\nMGECTU1NJCQk0NTU1G19u91OREQEAGFhYdjtdjXW3t6O2WxWt9fe3t6j7YwZM9S2F8e7thMREYHZ\nbMbHx6fHti/eR1eCggLw9TX21WXNGI0+hIaO9brcrvK3tQW6XTckJHDI91WOvfTd2/Jr3ffe9FmY\nrr76agIDA7sNk9lsNiZMmEBCQgJ79uxRl586dYqOjg7i4+MBSEhIoL6+Xo3X1taSkJAAQFxcHA6H\ng6amJiZOnAhAXV2dOhkiISFBnQjR1dZkMhEVFYWPjw8xMTHU19er26utrWXWrFm99uX06Y6+uqup\n0NCxtLZ+73W5XeW328+4XdduPzPk+yrHXvrubfm17jvA+PHBLpf3OZTn7+9PamoqH3zwAQBnz55l\n7969pKWlkZSUxPnz5zl06BAAJSUlLFiwgNDQUODC8N+ePXtwOBx0dHRQWlrK0qVLAQgNDWXhwoVY\nrVYAKisr6ezsZO7cuQCkpKRQU1PDiRMn1G1bLBZ1ZuCSJUvYuXMncKFQlpWVYbFYBnZ0hBBC6Ea/\nbrB9+umn2bBhA6mpqYwZM4a0tDRSUlKAC1PJN2zYgNFoJCQkhLy8PLVdUlISdXV1ZGRkYDAYsFgs\nauEByMnJITs7m2XLluF0OiksLFRvoA0LC6OgoICsrCx8fX2ZMmUKmZmZatuMjAwaGxtJT09HURTW\nrVtHTEzMkBwUIYQQ2jEoXjRj4Jtv2vteSUPefFp/af7GxhM8VVjR45FE39u/5teP3jzkz8qTYy99\n97b8WvcdBjGUJ4QQQgwnKUxCCCF0RQqTEEIIXZHCJIQQQlekMAkhhNAVKUxCCCF0RQqTEEIIXZHC\nJIQQQlekMAkhhNAVKUxCCCF0RQqTEEIIXZHCJIQQQlekMAkhhNAVKUxCCCF0RQqTEEIIXZHCJIQQ\nQl
ekMAkhhNAVKUxCCCF0RQqTEEIIXZHCJIQQQlekMAkhhNAVKUxCCCF0RQqTEEIIXZHCJIQQQld8\n+1ph/fr1nDx5EgBFUTAYDLz66qsEBgYCUF1dTW5uLj4+PphMJvLy8jCZTGr7rVu3UlpaisFgYNGi\nRTz44INqrK2tjezsbNra2nA6neTk5BAXF6fGKyoqKCgowNfXl6lTp5Kbm4u/v78az8/Pp7KyEkVR\nWL58OYsXLx78ERFCCKGpPgsTwNtvv+1y+blz51i9ejUvvPACs2fPZvPmzeTk5PDSSy8BUF5ejtVq\nZdeuXSiKwuLFi4mOjmbevHkA5ObmEh8fz5o1azh48CCrVq1i3759+Pn50dLSQlZWFsXFxURFRZGd\nnc3GjRt5+umnASgqKqKqqgqr1YrNZiM5OZmZM2cyY8aMoTguQgghNDKoobzy8nKMRiOzZ88GwGKx\nsG/fPmw2GwDFxcUkJyfj5+eHv78/KSkp7NixA4DW1lb27t1Leno6AImJifj5+XHgwAEASktLiY2N\nJSoqCoD09HSsViuKoqjbTktLA8BsNjN//nyKi4sH0x0hhBA60K/C9Mwzz3DPPfewcuVKDh48qC4/\nfPgw06ZNU7+PjIxkzJgxVFVVuYxHR0dz5MgRAKqqqggICCAyMlKNT58+XY27amu322loaMDhcHD0\n6FG32xZCCDFy9TmUN336dH7wgx8wa9YsDh8+zPLlyykqKiI2Npbm5maCgoK6rR8SEkJzczMAzc3N\nBAcHq7Hg4GBaWlrUWF9tJ02a1C3WtTwwMBCn0+l220IIIUauPs+YHnroIWbNmgVAQkIC8+fP5733\n3lPjBoOhR5uu4TZ38f62daW3bffVVgghhP71a/LDxa644grq6uoACAsLo6amplvcbrcTERGhxu12\nuxprb2/HbDYDEB4eTnt7e4+2XZMXwsLCusW7thMREYHZbMbHx6fHtsPDw3vd96CgAHx9jZfV3+Fk\nNPoQGjrW63K7yt/WFuh23ZCQwCHfVzn20ndvy69133vTZ2F6/fXXeeihh9Tvm5ubmTBhAnDhDOqj\njz5SY6dOnaKjo4P4+Hg1Xl9fr8Zra2tJSEgAIC4uDofDQVNTExMnTgSgrq5OnQyRkJCgToToamsy\nmYiKisLHx4eYmBjq6+vV7dXW1qpndu6cPt3RV3c1FRo6ltbW770ut6v8dvsZt+va7WeGfF/l2Evf\nvS2/1n0HGD8+2OXyPofy3nrrLfXaTWNjIx9//LF6v9DcuXPp7Ozk0KFDAJSUlLBgwQJCQ0MByMjI\nYM+ePTgcDjo6OigtLWXp0qUAhIaGsnDhQqxWKwCVlZV0dnYyd+5cAFJSUqipqeHEiRPqti0WCz4+\nF3Z5yZIl7Ny5EwCbzUZZWRkWi2UAh0YIIYSe9HnG9OCDD7J69Wp8fX05c+YMOTk53HjjjQD4+/uz\nZcsWNmzYgNFoJCQkhLy8PLVtUlISdXV1ZGRkYDAYsFgsauEByMnJITs7m2XLluF0OiksLFRvoA0L\nC6OgoICsrCx8fX2ZMmUKmZmZatuMjAwaGxtJT09HURTWrVtHTEzMkB0YIYQQ2jAoXjRj4Jtv2vte\nSUPefFp/af7GxhM8VVjB2JAJ3db73v41v370ZiZPjvJo/uGkt2PvLbm9Pb/WfYdBDOUJIYQQw0kK\nkxBCCF2RwiSEEEJXpDAJIYTQlcu+wVYIvXE6nZw8+YXL2JVXXqXeYiCEGBmkMIkR7+TJL3gsfzdj\ngrs/+eNsezObn04Z8hl8QgjPksIkRoUxweE9ppYLIUYmGeMQQgihK1KYhBBC6IoUJiGEELoihUkI\nIYSuSGESQgihK1KYhBBC6IoUJiGEELoihUkIIYSuSGESQgihK1KYhBBC6IoUJiGEELoihUkIIYSu\nSGESQgihK/J08VHE3XuJRtM7iRTFyalTX3Zbdun3QoiRTQrTKOLq
vUSj7Z1EZ0/beG7b/zAm+Li6\nrPWrWkIjo7XbKSHEkJLCNMp4w3uJLu3jmfZmDfdGCDHUpDAJMYLJa+XFaHRZhen48eMkJyfz5ptv\nMnv2bACqq6vJzc3Fx8cHk8lEXl4eJpNJbbN161ZKS0sxGAwsWrSIBx98UI21tbWRnZ1NW1sbTqeT\nnJwc4uLi1HhFRQUFBQX4+voydepUcnNz8ff3V+P5+flUVlaiKArLly9n8eLFAz4QQoxE8lp5MRpd\n1p9TmzZtws/PT/3+3LlzrF69mieffJLt27cTFxdHTk6OGi8vL8dqtfLee+9RVFSE1WqlrKxMjefm\n5hIfH8/27dtZu3Ytq1at4ty5cwC0tLSQlZVFQUEBO3bsQFEUNm7cqLYtKiqiqqoKq9XK66+/Tn5+\nPseOHRvwgRBipOoa2rz469JCJcRI0u/CdPjwYcaNG0dYWJi6rKysDKPRqJ49WSwW9u3bh81mA6C4\nuJjk5GT8/Pzw9/cnJSWFHTt2ANDa2srevXtJT08HIDExET8/Pw4cOABAaWkpsbGxREVd+IsvPT0d\nq9WKoijqttPS0gAwm83Mnz+f4uLiwRwLIYQQOtDvwrR582bWrFmjFgaAI0eOMG3aNPX7yMhIxowZ\nQ1VVFXChmF0cj46O5siRIwBUVVUREBBAZGSkGp8+fboad9XWbrfT0NCAw+Hg6NGjbrcthBBi5OpX\nYSovL+ef/umfmDhxYrflzc3NBAUFdVsWEhJCc3OzGg8ODlZjwcHBtLS0DKhtSEiIutxms+F0Ot1u\nWwghxMjVZ2FSFIX/+I//4Gc/+5nLuMFgcNmmt3h/27rbH3ft+2orhBBC//qclbd7926SkpK6nZ10\nCQsLo6amptsyu91ORESEGrfb7Wqsvb0ds9kMQHh4OO3t7T3azpgxQ217cbxrOxEREZjNZnx8fHps\nOzy89wu+QUEB+Poa++qyZoxGH0JDxw64fVtboMvlISGBfW53sLkH69L87vpyufrTd1f5h9Ngcvd2\nnEZ73yX/yM3dlz4L06FDh6itreVPf/oTiqLQ3NzMc889x6RJk7jzzjv58MMP1XVPnTpFR0cH8fHx\nACQkJFBfX6/Ga2trSUhIACAuLg6Hw0FTU5M6RFhXV6dOhkhISFAnQnS1NZlMREVF4ePjQ0xMDPX1\n9er2amtrmTVrVq99OX26oz/HRDOhoWNpbf1+wO3t9jNul/e13cHmHqxL87vry+XqT99d5R9Og8nd\n23Ea7X2X/CM3d5fx43ue8EA/hvJ++ctfsn37dt5++222bdtGeHg4P//5z3nllVdISkri/PnzHDp0\nCICSkhIWLFhAaGgoABkZGezZsweHw0FHRwelpaUsXboUgNDQUBYuXIjVagWgsrKSzs5O5s6dC0BK\nSgo1NTWcOHFC3bbFYlFvGFyyZAk7d+4EwGazUVZWhsViGfABEkIIoQ/9vsG2pqaGLVu2YLPZ2LRp\nEwsWLGD58uW88sor5ObmYjQaCQkJIS8vT22TlJREXV0dGRkZGAwGLBaLWngAcnJyyM7OZtmyZTid\nTgoLC9UbaMPCwigoKCArKwtfX1+mTJlCZmam2jYjI4PGxkbS09NRFIV169YRExMzFMdECCGEhvpd\nmGJjY9m0aVOP5TNnzlTvTXJlxYoVrFixwmXMZDJRWFjotu2cOXPUMypXnnrqKfc7LIQQYkSSZ+WN\ncq5eE9FFnqUmhNAjKUyjnKvXRIA8S00IoV9SmLyAN7wKQwgxesg4jhBCCF2RwiSEEEJXpDAJIYTQ\nFSlMQgghdEUKkxBCCF2RwiSEEEJXpDAJIYTQFSlMQgghdEUKkxBCCF2RwiSEEEJXpDAJIYTQFSlM\nQgghdEUe4io05XQ6OXnyC9raAru9JtzdqzqEEKOfFCahqZMnv+Cx/N2MCQ7vtrz1q1pCI6M12ish\nhJakMAnNuXotx5n2Zo32Rgih
NbnGJIQQQlekMAkhhNAVKUxCCCF0RQqTEEIIXZHCJIQQQlekMAkh\nhNCVfk0X37ZtG/v370dRFGw2G3fffTf33nsvANXV1eTm5uLj44PJZCIvLw+TyaS23bp1K6WlpRgM\nBhYtWsSDDz6oxtra2sjOzqatrQ2n00lOTg5xcXFqvKKigoKCAnx9fZk6dSq5ubn4+/ur8fz8fCor\nK1EUheXLl7N48eJBHxAhhBDa6ldhKikp4c0338RsNtPQ0MCiRYu49tpriY2NZfXq1bzwwgvMnj2b\nzZs3k5OTw0svvQRAeXk5VquVXbt2oSgKixcvJjo6mnnz5gGQm5tLfHw8a9as4eDBg6xatYp9+/bh\n5+dHS0sLWVlZFBcXExUVRXZ2Nhs3buTpp58GoKioiKqqKqxWKzabjeTkZGbOnMmMGTM8dKiEEEIM\nh34N5eXn52M2mwG4+uqrCQkJ4eTJk5SXl2M0Gpk9ezYAFouFffv2YbPZACguLiY5ORk/Pz/8/f1J\nSUlhx44dALS2trJ3717S09MBSExMxM/PjwMHDgBQWlpKbGwsUVFRAKSnp2O1WlEURd12WloaAGaz\nmfnz51NcXDwUx0QIIYSG+lWYLj4L+cMf/sC4ceP44Q9/yOHDh5k2bZoai4yMZMyYMVRVVQH0iEdH\nR3PkyBEAqqqqCAgIIDIyUo1Pnz5djbtqa7fbaWhowOFwcPToUbfbFkIIMXL1+5FEx44dY+3atZw5\nc4aNGzcSFBREc3MzQUFB3dYLCQmhufnC42Sam5sJDg5WY8HBwbS0tKixvtpOmjSpW6xreWBgIE6n\n0+22hRBCjFz9LkwzZsxgz549VFdX8/DDD7NlyxYADAZDj3W7htvcxXuLXdzWld623VdbIYQQ+nfZ\nD3GdOXMm8+bNY9u2bUyaNImamppucbvdTkREBABhYWHY7XY11t7erl6rCg8Pp729vUfbrmHDsLCw\nbvGu7URERGA2m/Hx8emx7fDw7k+ovlRQUAC+vsbL7fKwMRp9CA0dO+D2bW2Bl7V+SEigmm+wuQfq\ncvf5cl3cx95o1f/B5u7t+I32vkv+kZu7L30WptbWViorK7n99tvVZYGBgdjtdq655ho+/PBDdfmp\nU6fo6OggPj4egISEBOrr69V4bW0tCQkJAMTFxeFwOGhqamLixIkA1NXVqZMhEhIS1IkQXW1NJhNR\nUVH4+PgQExNDfX29ur3a2lpmzZrVa19On+7oq7uaCg0dS2vr9wNuf/H7jPq7fle+weYeqMvd54Fs\nvz/90qr/g83d2/Eb7X2X/CM3d5fx44NdLu9z8sN3333Hli1bOHv2LAAtLS3s37+fOXPmkJSUxPnz\n5zl06BBwYVr5ggULCA0NBSAjI4M9e/bgcDjo6OigtLSUpUuXAhAaGsrChQuxWq0AVFZW0tnZydy5\ncwFISUmhpqaGEydOqNu2WCz4+FzY5SVLlrBz504AbDYbZWVlWCyWgR0dIYQQutHnGdP48eO57bbb\nWLFiBX5+frS3t5Oens6yZcsA2LJlCxs2bMBoNBISEkJeXp7aNikpibq6OjIyMjAYDFgsFrXwAOTk\n5JCdnc2yZctwOp0UFhaqN9CGhYVRUFBAVlYWvr6+TJkyhczMTLVtRkYGjY2NpKenoygK69atIyYm\nZsgOjBBCCG30WZj8/f3JzMzsVhQuFhsbq96b5MqKFStYsWKFy5jJZKKwsNBt2zlz5qhnVK489dRT\nbmNCCCFGJnmDrRBCF5xOJydPfuEyduWVV6nD+GL0k8IkhNCFkye/4LH83YwJ7j679mx7M5ufTmHy\n5CiN9kwMNylMwuu4+su8rS2Q4OBw+atcY2OCwxkbMkHr3RAak8IkvI6rv8zlr3Ih9EMKk/BK8pe5\nEPol4xZCCCF0RQqTEEIIXZHCJIQQQlekMAkhhNAVKUxCCCF0RQqTEEIIXZHCJIQQQlekMAkhhN
AV\nKUxCCCF0RQqTEEIIXZFHEgnhJdy9VkIeYCv0RgqTGLUUxcmpU1/2WO5qmTeQ10qIkUIKkxi1zp62\n8dy2/2FM8PFuy1u/qiU0MlqbndKYPLxWjARSmMSo5uqD+Ex7s0Z7M3xcnS1665miGHmkMHmpSz+4\n2toCsdvPAN75Gmt3w36gj+Ph7vqQu312dbbozWeKYmSRwuSl3A1zeev1Br0fD3fXh3orNpeeLXrD\nmaIYHaQweTG53tCd3o/HcA9LujtL08MZpBjdpDAJIVySV9ALrUhhEkK4pfezSDE6SWESYgDcDXNB\n/4e6urZx8cSTy92GEKNRn4Wps7OTd955h/379wNw7tw5Hn/8cebMmQNAdXU1ubm5+Pj4YDKZyMvL\nw2Qyqe23bt1KaWkpBoOBRYsW8eCDD6qxtrY2srOzaWtrw+l0kpOTQ1xcnBqvqKigoKAAX19fpk6d\nSm5uLv7+/mo8Pz+fyspKFEVh+fLlLF68ePBHRIh+GIqbVeWGVyFc67MwNTU18c4777Br1y7GjRtH\nRUUFq1at4g9/+ANms5nVq1fzwgsvMHv2bDZv3kxOTg4vvfQSAOXl5VitVnbt2oWiKCxevJjo6Gjm\nzZsHQG5uLvHx8axZs4aDBw+yatUq9u3bh5+fHy0tLWRlZVFcXExUVBTZ2dls3LiRp59+GoCioiKq\nqqqwWq3YbDaSk5OZOXMmM2bM8ODhEuL/DMUwlwyVCU9zd3YfEqLfz8o+xwrGjRtHZmYm48aNA+Dm\nm28mICCATz/9lPLycoxGI7NnzwbAYrGwb98+bDYbAMXFxSQnJ+Pn54e/vz8pKSns2LEDgNbWVvbu\n3Ut6ejoAiYmJ+Pn5ceDAAQBKS0uJjY0lKurCX43p6elYrVYURVG3nZaWBoDZbGb+/PkUFxcP1XER\nQohRoevM/KnCCvXrsfzdNDY2ar1rbvVZmEJDQ7njjju6LTt37hxhYWEcPnyYadOmqcsjIyMZM2YM\nVVVVAD3i0dHRHDlyBICqqioCAgKIjIxU49OnT1fjrtra7XYaGhpwOBwcPXrU7baFGMm6bvZtbDyh\nfslTG8RgdJ2Zd31dOnysN5c9+eHgwYNceeWVzJ49m9///vcEBQV1i4eEhNDcfOHeiubmZoKDg9VY\ncHAwLS0taqyvtpMmTeoW61oeGBiI0+l0u20htOLuCRKXM5lBz09t0PsTMsTocFmFqaOjgxdffJEX\nXnhBXWYwGHqs1zXc5i7e37au9LbtvtoGBQXg62vsdR0tGY0+hIaOHXD7trbAIdmPkJDAQe3H5Riq\nffYkV8d1WQLhAAAevUlEQVTD3X67Kipn25v53bN3c/XVV/drGzD8T23obx97e0KGqz46nU63Q0aT\nJ0/u9jvf2/Hw1O/kYP/PjYT87o6r1n3vzWUVppycHB544AFmzpwJQFhYGDU1Nd3WsdvtREREqHG7\n3a7G2tvbMZvNAISHh9Pe3t6jbdfkhbCwsG7xru1ERERgNpvx8fHpse3w8N5PT0+f7ric7g670NCx\ntLZ+P+D2l045Hsx2BrMfl5tL71wdj97229WEhsvdxnC7nP1zN2HD1TYaG0/0OvMwISFWbdPb8fDU\n7+Rg/8+NhPzujuv5805N+w4wfnywy+X9Pu/Oy8vj2muv5Sc/+QkOh4NTp06RkJBAfX29us6pU6fo\n6OggPj4eoEe8traWhIQEAOLi4nA4HDQ1Nanxuro6rrnmGrdtTSYTUVFR+Pv7ExMT0yM+a9as/nZH\nCDFMLr2+MRKucQht9aswvfbaa3R2dpKamsr333/PiRMnKCkpYe7cuXR2dnLo0CEASkpKWLBgAaGh\noQBkZGSwZ88eHA4HHR0dlJaWsnTpUuDCpIqFCxditVoBqKyspLOzk7lz5wKQkpJCTU0NJ06cULdt\nsVjUMewlS5awc+dOAGw2G2VlZVgslqE6LkJ4DVeTLTw94a
IrZ0NDg0zwED30OZR3/PhxCgoKMBgM\nvPvuuyiKgsFgYM2aNfj7+/PKK6+Qm5uL0WgkJCSEvLw8tW1SUhJ1dXVkZGRgMBiwWCxq4YELQ4PZ\n2dksW7YMp9NJYWGhegNtWFgYBQUFZGVl4evry5QpU8jMzFTbZmRk0NjYSHp6OoqisG7dOmJiYoby\n2AjhFbR4oeJwT/CQ18qPLH0WpilTpvS4jnSxmTNnqvcmubJixQpWrFjhMmYymSgsLHTbds6cOeoZ\nlStPPfWU25gQov+0eKHicE7wkKdsjCzyrDwhPMzdFGsZuhperorvUEzvF0NPCpMQHqbFUNlo4sl7\np9xN75ezKG1JYRJiGGgxVDZaePrtwvK8Qv2RwiSE0D0pHt5FCpMQvZDrQ0IMPylMQvRCrg/1j14K\nuLtp4fKHxMgihUmIPsj1ob7ppYC7mxauxT1SILP7BkoKkxBiSOilgA/3fsg9UkNPCpMQQlxkINPT\nZXLG0JLCJIQQF/H09HTRNylMQghxCTkD0pYUJiHEiORqyE1m340OUpiEECOSnl9B786lM/ja2gLV\nF/nJDL7/I4VJCDFiDfcr6AdLZvD1jxQmIYQYRnL9qm9SmIQQwgPkGtjASWESQggPGInXwPRCCpMQ\nQnjISLsGphdSmIQQoh/08qBabyCFSQgh+sGTD6r15Ft6RyIpTEII0U+eekCsPAapOylMQgihA66K\nnrszqdF+FiWFSQghdMrVmZQ3nEVJYRJCCB3zxhty+30ueOTIEW6//XY++OCDbsurq6vJyMhg2bJl\nPProo7S1tXWLb926lTvvvJO77rqLN954o1usra2NRx99lGXLlpGRkUFVVVW3eEVFBenp6WRkZLB+\n/XocDke3eH5+Punp6dx1113s2rWrv10RQgihY/0qTPv37+eNN94gKCio2/Jz586xevVqnnzySbZv\n305cXBw5OTlqvLy8HKvVynvvvUdRURFWq5WysjI1npubS3x8PNu3b2ft2rWsWrWKc+fOAdDS0kJW\nVhYFBQXs2LEDRVHYuHGj2raoqIiqqiqsViuvv/46+fn5HDt2bFAHQwgh9K7rulNj44keX06nU+vd\nGxL9Kkzx8fG8+OKLjBs3rtvy8vJyjEYjs2fPBsBisbBv3z5sNhsAxcXFJCcn4+fnh7+/PykpKezY\nsQOA1tZW9u7dS3p6OgCJiYn4+flx4MABAEpLS4mNjSUq6sI4anp6OlarFUVR1G2npaUBYDabmT9/\nPsXFxYM5FkIIoXtd152eKqzo9vVY/u5uTy4fyfpVmCIjI10uP3z4MNOmTeu23pgxY9QhuUvj0dHR\nHDlyBICqqioCAgK6bXv69Olq3FVbu91OQ0MDDoeDo0ePut22EEKMZl3XnS7+uvSJ5SPZoOYbNjc3\n9xjeCwkJobm5WY0HBwerseDgYFpaWgbUNiQkRF1us9lwOp1uty2EEGLkGvSsPIPB0GNZ13Cbu3h/\n27rS27b7ahsUFICvr7HXdbRkNPoQGjp2wO3b2gKHZD9CQgIHtR+XY6j2WQhvpyhO2ttbevyfam93\n/Qf7YD9vPGlQhSksLIyamppuy+x2OxEREWrcbrersfb2dsxmMwDh4eG0t7f3aDtjxgy17cXxru1E\nRERgNpvx8fHpse3w8N5PZU+f7rjcLg6r0NCxtLZ+P+D2XW/CHCy7/cyg9uNycwkhBu/saRs/L/xz\njyE9d49MOn/eOWz/z90ZPz7Y5fJBDeUlJCRQX1+vfn/q1Ck6OjqIj493Ga+trSUhIQGAuLg4HA4H\nTU1Naryuro5rrrnGbVuTyURUVBT+/v7ExMT0iM+aNWsw3RFCiBHN1bWngHFmrXfrsg2qMM2dO5fO\nzk4OHToEQElJCQsWLCA0NBSAjIwM9uzZg8PhoKOjg9LSUpYuXQpAaGgoCxcuxGq1AlBZWUlnZydz\n584FICUlhZqaGk6cOK
Fu22KxqI/hWLJkCTt37gTAZrNRVlaGxWIZTHeEEELoQL+G8o4dO8bLL79M\nXV0d7777Ln/961/55S9/ib+/P1u2bGHDhg0YjUZCQkLIy8tT2yUlJVFXV0dGRgYGgwGLxaIWHoCc\nnByys7NZtmwZTqeTwsJC/P39gQtDeQUFBWRlZeHr68uUKVPIzMxU22ZkZNDY2Eh6ejqKorBu3Tpi\nYmKG6riIfnA6nW6np472Z3kJITynX4VpxowZbNq0yWUsNjZWvTfJlRUrVrBixQqXMZPJRGFhodu2\nc+bMUc+oXHnqqafcxoTnnTz5BY/l7+4xpu0Nz/ISQniOPCtPdHO5TzP2xud4CSE8SwqT6MZbn2Ys\nhNAPKUyiBzkLEkJoSa5OCyGE0BUpTEIIIXRFhvJEn9xNiHC1TAghBksKk+iTqwkR4P5RJ0IIMRhS\nmES/uJoQcaa9WaO9EUKMZlKYxJBzN/QH8kQIIUTfpDCJIedu6E/uhxJC9IcUJuERci+UEGKgZExF\nCCGErkhhEkIIoStSmIQQQuiKFCYhhBC6IoVJCCGErsisPDFsXN3fJI81EkJcSgqTGDau7m+SxxoJ\nIS4lhUkMq0vvb5LHGgkhLiXXmIQQQuiKFCYhhBC6IoVJCCGErkhhEkIIoStSmIQQQujKiJ6V53A4\neOaZZ/jf//1fzp8/z9q1a7nlllu03i0hhBCDMKIL06ZNmwB47733OH78OEuWLOGjjz4iLCxM4z0T\nQggxUCN2KE9RFKxWK3fddRcAU6ZMYebMmfz+97/XeM+EEEIMxogtTI2NjbS1tTFt2jR1WXR0NIcP\nH9Zwr4QQQgzWiB3K+/bbbwEIDg5WlwUHB1NXV6fVLnlEY+OJfq976tSXnL3kSQod39kwuFj3cpYP\nxTY8uW29bEPv+6eXbeh9/7yhj5d+TujNiC1MXQyG7odcURS3644fH+w2pheX7uP48fGX0Tqef/mX\nBUO7Q0IIMcxG7FBeeHg4AHa7XV3W3t6uLhdCCDEyjdjCFBUVhclkor6+Xl1WW1tLQkKChnslhBBi\nsEZsYTIYDNx9992UlJQAcPz4cWpqakhJSdF4z4QQQgyGQentoozOXXqD7ZNPPsmcOXO03i0hhBCD\nMKILkxBCiNFnxA7lCSGEGJ2kMAkhhNAVKUxCCCF0RQqTEF7O4XBovQtCdCOTH4TK4XDg7++v9W54\n1LFjxyguLubIkSO0tLSgKArh4eHMmjULi8VCTEyMZvt2/vx5jEbjsOddvnw5b7/9Ni+++CJr1671\nSI7Fixdz++23c8cddzB58mSP5OiN1j/36upq/vKXvxAWFsbtt9/O2LFjAWhoaOC3v/0tBoOB5557\nzmP59+7dy8mTJ/nJT37CVVddxf79+9m6dStnzpxhzpw5ZGZmEhAQ4LH8l0sKk05p8SE12j+gdu3a\nRX5+PvPnzyc6Olp9zmJ7ezu1tbWUlZWxbt06ze6Fe/zxx3nppZc8su0PPvjAbey1115j5cqVvPvu\nu7z//vseyX/nnXdyxx13UFpaip+fH4sXL2bRokWEhIR4JN/FtP65f/jhh6xfv57o6GgcDgetra08\n++yzzJs3D4DvvvuOG2+8kerqao/k37hxI0VFRUyePJmvv/6af//3f+fZZ58lLS2NsWPHUlpayvXX\nX8/Pf/5zj+QfiBH/rLzRKisryyMfUr19QH377bd88MEHVFRUeKwwGY1Gxo0bx9q1a4f9A+rtt9+m\ntLQUs9nsMm6z2XjooYc88gH18ssv97mOJx9A/Nxzz6EoCjNnzuzxPMlvv/2WkpISTp486bH8QUFB\nrFixghUrVlBfX8/u3btZunQpU6dOZfHixcyfPx8/Pz+P5Nby5w6wdetWSkpKiI6OBi68GeFXv/oV\n9fX1rFixosfzPofa/v37+cMf/kBoaCh//etfWbNmDcXFxVx11VUA3H333dxzzz0e3YfL
JYVJA1p+\nSHnzB5TBYHD74QRgNps99iHxxz/+kdDQUCZMmOB2nTNnzngkN8B//dd/UVhYSH19PY899hjx8f/3\ncOCuM+XMzEyP5b/Y1KlTyczMJDMzk88++4zdu3fzm9/8hptuuolf/vKXQ55Py587wNixY9WiBDB5\n8mR++9vf8uKLL/LrX/+aVatWeSw3QEREBKGhoQBcf/31TJgwQS1KXfs3HH8YXg4pTBrQ8kPKmz+g\nbrrpJlasWEFaWhrTpk0jODgYg8GA3W6nvr6enTt3kpiYOOR5AV588UWeffZZnnvuObcfgp487iEh\nIaxbt46vvvqKTZs20dHRweOPP05UVJS6zv/7f//PY/ndue6667juuus4f/48f/rTnzySQ8ufO1wo\njE1NTUycOLHb8rVr17Jt2zbWr1/vsdwAPj4+HD9+nClTpgAXzuAu1traqr8JMIoYdg0NDcrDDz+s\nOJ1Ot+s89thjHt2HU6dOKevXr1eysrKUhoYGRVEU5b777lMURVGampo8lrcrhzudnZ3KgQMHPJZ/\n+/btSmpqqjJz5kwlNjZWiYmJUWbOnKmkpqYqRUVFHsurKIryt7/9rddj+80333g0/8WOHTumrFmz\nRsnJyVHuvvtuj+dbvHixkpub6/E87mj5cy8pKVFuuOEG5a233nIZLy0tVWbNmuWx/Lt371buu+8+\n5ejRoz1i7733npKUlKRs3brVY/kHQiY/aOTvf/87kZGRbs+avv32WyIiIjy+H59//jmbNm0iLCyM\nmpoa3nvvPY/mS01N5frrrycnJ8ejefricDiw2WzAhaGc0T4b0Z3/+Z//4eDBgx4fTtILvf7cz5w5\nQ2BgoNa7oRtSmATgfR9QWlEUhYaGBpqbm9Upy1dffTU+PsNzS6G35/dmI+nYS2HSkJa/KFr/kmqd\n352//OUvHnlCvcPhYOPGjVit1m4vtwQwmUxYLBYyMzM99he8t+fvi6d+7nrI39exT09P5/HHH9fN\n2SNIYdKElv9Jtf6A0Dp/Xzx1L9H69etRFIX09HSio6PVWVB2u53a2lpKSkpQFIW8vLwhzy35++bJ\ne8i0zq/3Y++KFCYNaPmLovUvqZb5f/zjH/eYIn8xRVGw2Wx8+umnQ547IyODHTt2DHodyX/5tPy5\n6yG/1j/7gZDp4hqor693+UsQGhrKjTfeyI033khGRsaoy611/gULFqAoCrfddpvLuKIo/brHbCA6\nOzs5ceJEt+nZFztx4gSdnZ0eye3t+bX8ueshv9Y/+4GQwqQBLX9RtP4l1TL/k08+yc9+9jPuv/9+\nrrjiCpfrfPLJJx7JnZWVxd13301sbKx6Lw1ceCxOfX09NTU1FBQUeCS3t+fX8ueuh/xa/+wHQoby\nNFBRUUFWVlafvyieuBiqZW495NdSS0sLpaWlHD58mObmZgDCwsJISEggJSWFsLAwyS88YqQdeylM\nGtHyF0XrX1Kt8wshdG4o79YVYiTq62kUozW3t+f35r7rIX9v9HdnlRdbvny5V+bWOr+i4aCBlrm9\nPb83910P+XsjhUlH5D+pEELINSZdue+++9i2bZvX5dZDfiGEfkhhEkJjzzzzDLm5uZSWlpKcnOyR\nHHV1dbS1tXH99dcD8Le//Y2SkhKampqYMGECqamp3HDDDR7J3eXo0aPU1dVxww03MHHiROrq6igp\nKcHhcHDLLbdw6623eiz3uXPnOHDgAEeOHOn2GKyEhATmzZvn8SeNOBwOjh07htls5sorr1SXt7S0\nUFZWhsFgIDU11aP70KXrhaBffvklEyZMYNGiRW5v39CKccOGDRu03glv9+2337J9+3ZKS0uprq5m\nwoQJmEwmj+T61a9+xdixY5k0aZJHtt8fX331Fe+++y7vvPMOxcXFfPTRR3z66ac4HA6mTp3q8Td6\n9ubFF1/0yFT1L7/8kvb2dpdf27dv5+abb+a1117z
2FtUV65cSWBgIDfccAP/+Z//yeOPP85VV13F\n5MmT+f7773n11VcJDw8nNjbWI/mLi4tZvXo1lZWVvPPOO8yaNYtHHnmEgIAAzpw5w5tvvom/vz/X\nXXfdkOeuqqoiIyODTz75BKfTia+vL06nky+//JLS0lLefvttfvCDH3jsaf5VVVVYLBbeeusttm3b\nxpEjR7jhhhsIDg7m/PnztLa2snbtWtasWeOR/D/72c/UP3hqampITU3FZrNhNBo5evQoL730EgkJ\nCfoqThpMuPB6jzzyiPrv6upq5Z//+Z8Vi8WiZGZmKvfcc49y7bXXKn/60588kvvHP/6x8uijjyq3\n3367UlBQoNTV1XkkjzsHDhxQrr32WiUtLU15+OGHlbS0NCU+Pl5Zt26dcv/99yupqakeex9UZWVl\nn1/33nuvR3LHxMQosbGx6ruALv7qWhYbG+uR3IqiKEuWLFH/vXz5cqW+vr5bvLm5uds6Qy01NVU5\nfvy4oiiK8tlnnym33HKLUlNTo8a/+uorJS0tzSO5lyxZonzyySdu4wcPHvRo3++77z6luLhY+e67\n75SWlhalpKRESUtLUz799FNFURTl9OnTSkxMjMfyX/w7vXr1auXjjz/uFv/b3/7m0f4PhDz5QQPf\nffed+u+XX36Z3/zmN92GMf7+97/z3HPPccsttwx57sjISLZs2UJbWxt79+4lJyeHjo4OUlJSSE5O\n9vg9RBs3buTdd9/t9ubcgwcPsmfPHt566y0qKyt59tln2bRp05DnfuKJJzAajfj6uv+173pXz1D7\n3e9+x6uvvsrs2bN54IEHGDNmjBrrur42XG8OPn/+vPo20y5hYWEefeJHUFAQV199NQDXXnstQUFB\nxMTEqPGJEycybtw4j+T28fHp9Q21N910k0efau90OrFYLMCF15jfeeed3Hrrraxbt46lS5eSmJjo\n0VGCi7dts9l6DJlec801vf6f0ILMytOAlr8oXblNJhNLlizhnXfe4aWXXuLs2bP867/+K4888gh7\n9uzxSG6AwMDAbkUJIDExkc8//xy48CHx7bffeiR3Xl4et912Gx9//LHbr6SkJI/kTkxMZOvWrUyd\nOpWHH36YoqIinE4n8H8/E08+o/DWW2/lmWee4euvvyYtLY3169dTU1PDN998w9GjR3n++ec9+mLK\nc+fOYbVaqa2t5bXXXiMgIID3339fjZeXl9PR0eGR3CaTiZdffpmvvvqqR6ypqYlXXnnFY0PncOHn\ne2nRN5vNvPzyy+zevZvi4mKP5b7UFVdcQWNjY7dlZ86ckWflie66flEmT56sLhvuX5RJkyaxcuVK\nVq5cSU1NDbt37+Zf/uVfPJJr3Lhx7Nq1i5SUFHx8fHA4HLzzzjvqm3w7Ojo4ffq0R3InJSXR2NjI\nqVOn3D6zbOnSpR7J3WXhwoUsWLCA4uJi7rvvPu655x51qvzNN9/ssbyPPPIIr7/+OikpKXR2dnL2\n7Fk++OADNZ6UlOTR1x48/fTTZGZm8u2333LNNdfw2muv8dBDD/H8889jNBpxOp1s3rzZI7nz8vLI\nzc3ltttuIyAggKCgIAwGA+3t7TgcDhYuXOjRvicmJnLvvfeyatUq5s6dqy739/fnN7/5Dbm5uR7L\n3ZVn/fr1wIU/EN5//32ysrIA2LlzJ1u3bmXBggUe3YfLJbPyNPDQQw8xfvx4AL7//nuuvvpql78o\nnhjaue2221i6dCkPP/zwkG+7P06cOMHKlSv58ssvCQkJobW1laioKN544w2uuOIK1qxZQ3BwMM8/\n/7wm+zeczpw5w1tvvcVnn33Gq6++Oiw5Ozs7qaqq4uTJkzgcDkwmEzNnzmTixInDkr+lpUUdLj53\n7hwVFRV0dnZy3XXXER4e7tHcbW1tVFVVqWfk4eHhxMXFERoa6tG833//PUeOHGH8+PFMnTrV5ToV\nFRUe/cPEnZaW
Fs6cOcP48ePlRYHCPU//opw8eRIfHx+3ZwzDobOzk88++4yvv/6ayMhIrrvuOs3f\nXCuE0A8pTBo7c+YMdrudgIAAj//lpqfcWuUvKytjx44dHDlyhJaWFuDCeH9CQgJLly7tNtQymnJL\n/t5988036iiG5NeeFCYNKIrC1q1bKS4u5sSJE+rywMBAEhMTeeSRRzxyP4fWubXO/8Ybb2C1WklJ\nSSE6OrrbKzdqa2spLS3FYrHwwAMPjKrckr9vo/nV6iMh/6WkMGngueee469//SupqamEh4fzzTff\nsGvXLu68806CgoLYtm0bq1atcvvGy5GaW+v8d911F0VFRW6HSDs6Oli2bBklJSWjKre35++68N+b\nyspK9u/fP+S5Jf/AyKw8DXzyySdYrdZu/0nvvPNOnnjiCV5//XV+9KMfsXLlSo98OGuZW+v8iqL0\ner+IwWDw2MNktczt7fm//vprTCYT06ZNc7tOYGCgR3JL/oGRwqSBoKCgHn85+vr6qjfeBgUFeWwy\ngJa5tc6fnJzM4sWLSU5OVt+eazAYsNvt1NfXs2fPHtLT00ddbm/Pn5+fz7/927+xcuVKt2dsx44d\n80huyT8wMpSngV//+tdUV1eTmpqK2WymqamJ4uJi5s+fz6pVq/jzn/9MXl4eu3fvHlW59ZC/vLyc\n4uJil2/PzcjI8NgNtlrn9vb8X3zxBePGjcNsNruM93VGJ/mHlxQmDXR2dvLb3/6W3//+9zQ1NREZ\nGUlaWhqPPPIIBoOBt956i4kTJ/LTn/50VOXWQ34hhP5JYRJex2azUVVVpb7+ICIigvj4+GGZsq5l\nbm/Pf2nu8PBw4uPj3Z5FSH7tyDUmDX366adUV1er9/JMmjSJ2bNne/xBqlrn1ip/W1sbOTk57Nu3\njzFjxhASEgKA3W6no6ODBQsWkJub65HnpmmZ29vze3Pf9ZB/IOSMSQMNDQ2sWbOGzz//nDFjxnD2\n7Fl8fX2ZMmUKX3/9NcnJyfz85z/3yINctcytdf5HH32UuLg4LBYLkZGR3WKnTp2ipKSEf/zjHxQW\nFo6q3N6e35v7rof8A+K5N2oId+69917lzTffVE6fPq0oiqK0t7crmzdvVl5//XXl7NmzyiuvvKI8\n//zzoy631vmXLl3a5zrLli0bdbm9Pb83910P+QdCHlCmAYfDwYoVK9T3zwQFBbFmzRo+/vhjAgIC\nWLVqFZ999tmoy611fqfTyX//93+7jR86dIjz58+Putzent+b+66H/AMh15g04OPj0+NVF3//+9+7\n3WPgqddeaJlb6/zPPPMMq1evxmg0Mm3atG6vP6ivr6ezs5NXXnll1OX29vze3Hc95B8IucakgfLy\ncp544gmuvfZa9V6ef/zjH7z22mvcdNNN5Obm8vnnn/POO++Mqtx6yH/u3DnKy8td3kszd+5cjz76\nX8vc3p7fm/uuh/yXSwqTRhoaGvjwww/Ve3mSk5O56qqrgAs3w40dO9ZjM9S0zK11/q+//ppPP/2U\nsLAwZs+erS7vughsMBhYvXr1qMvt7fm9ue96yH/ZtL3EJcTwqaioUK677jolMTFRuf7665WUlBSl\nqqpKURRF6ejoUD7//HMlNjZ21OX29vze3Hc95B8IKUw69Ytf/MIrc3sy/5IlS5Q///nP6veffPKJ\nsmTJEmXv3r2KoijKd999p8TExIy63N6e35v7rof8AyGTHzTwwQcf9LlOdXX1qMutdX4/P79ur69O\nTEzkd7/7HTk5OTQ3N5Oamuqx54Vpmdvb83tz3/WQfyCkMGlgy5YtBAUFqVOmXTl58uSoy62H/N99\n91233AEBAbzwwgvk5eVRUFDgsbxa5/b2/N7cdz3kv2xan7J5o8OHDytPPPFEr+s89thjoy631vnf\nfPNN5Uc/+pGya9cul/FXX33VY0MaWub29vze3Hc95B8ImZWnkf379xMfH9/jES
Fdjh07xowZM0Zd\nbi3zd3Z20tTUxLhx49w+NPT48eNMmTJlVOX29vze3Hc95B8IKUxCCCF0RR5JJIQQQlekMAkhhNAV\nKUxCCCF0RQqTEEIIXfn/AI1snaA4I+GAAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f98bb94cb10>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(tstamp, bins=50)\n",
    "xticks = np.linspace(tstamp[0], tstamp[-1], 10)\n",
    "plt.xticks(xticks, map(lambda x: timestamp_to_date(x)[:7], xticks), rotation=90)\n",
    "pass"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we select the data from 1995-01-01 to the last day as the dataset (i.e., all the dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "start_t = time.mktime(datetime.datetime.strptime(\"1995-01-01\", \"%Y-%m-%d\").timetuple())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "raw_data = raw_data[raw_data['timestamp'] >= start_t]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Take the first 80% of the data as train and validation set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tr_vd_raw_data = raw_data[:int(0.8 * raw_data.shape[0])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_count(tp, id):\n",
    "    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)\n",
    "    count = playcount_groupbyid.size()\n",
    "    return count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def filter_triplets(tp, min_uc=5, min_sc=0):\n",
    "    # Only keep the triplets for songs which were listened to by at least min_sc users. \n",
    "    if min_sc > 0:\n",
    "        songcount = get_count(tp, 'movieId')\n",
    "        tp = tp[tp['movieId'].isin(songcount.index[songcount >= min_sc])]\n",
    "    \n",
    "    # Only keep the triplets for users who listened to at least min_uc songs\n",
    "    # After doing this, some of the songs will have less than min_uc users, but should only be a small proportion\n",
    "    if min_uc > 0:\n",
    "        usercount = get_count(tp, 'userId')\n",
    "        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]\n",
    "    \n",
    "    # Update both usercount and songcount after filtering\n",
    "    usercount, songcount = get_count(tp, 'userId'), get_count(tp, 'movieId') \n",
    "    return tp, usercount, songcount"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tr_vd_raw_data, user_activity, item_popularity = filter_triplets(tr_vd_raw_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After filtering, there are 7992863 watching events from 111148 users and 11711 movies (sparsity: 0.614%)\n"
     ]
    }
   ],
   "source": [
    "sparsity = 1. * tr_vd_raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])\n",
    "\n",
    "print(\"After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)\" % \n",
    "      (tr_vd_raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "unique_uid = user_activity.index\n",
    "unique_sid = item_popularity.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "song2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))\n",
    "user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open(os.path.join(DATA_DIR, 'pro', 'unique_uid.txt'), 'w') as f:\n",
    "    for uid in unique_uid:\n",
    "        f.write('%s\\n' % uid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open(os.path.join(DATA_DIR, 'pro', 'unique_sid.txt'), 'w') as f:\n",
    "    for sid in unique_sid:\n",
    "        f.write('%s\\n' % sid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Split 12.5% (10% of the total ratings) as validation set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "np.random.seed(13579)\n",
    "n_ratings = tr_vd_raw_data.shape[0]\n",
    "vad = np.random.choice(n_ratings, size=int(0.125 * n_ratings), replace=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vad_idx = np.zeros(n_ratings, dtype=bool)\n",
    "vad_idx[vad] = True\n",
    "\n",
    "vad_raw_data = tr_vd_raw_data[vad_idx]\n",
    "train_raw_data = tr_vd_raw_data[~vad_idx]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Make sure there is no empty users/items"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are total of 111148 unique users in the training set and 111148 unique users in the entire dataset\n"
     ]
    }
   ],
   "source": [
    "print \"There are total of %d unique users in the training set and %d unique users in the entire dataset\" % \\\n",
    "(len(pd.unique(train_raw_data['userId'])), len(unique_uid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are total of 11612 unique items in the training set and 11711 unique items in the entire dataset\n"
     ]
    }
   ],
   "source": [
    "print \"There are total of %d unique items in the training set and %d unique items in the entire dataset\" % \\\n",
    "(len(pd.unique(train_raw_data['movieId'])), len(unique_sid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_sid = set(pd.unique(train_raw_data['movieId']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "left_sid = list()\n",
    "for i, sid in enumerate(unique_sid):\n",
    "    if sid not in train_sid:\n",
    "        left_sid.append(sid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "move_idx = vad_raw_data['movieId'].isin(left_sid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "train_raw_data = train_raw_data.append(vad_raw_data[move_idx])\n",
    "vad_raw_data = vad_raw_data[~move_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are total of 11711 unique items in the training set and 11711 unique items in the entire dataset\n"
     ]
    }
   ],
   "source": [
    "print \"There are total of %d unique items in the training set and %d unique items in the entire dataset\" % \\\n",
    "(len(pd.unique(train_raw_data['movieId'])), len(unique_sid))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For test data, only keep the users and items that appear in the training/validation sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_raw_data = raw_data[int(0.8 * len(raw_data)):]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_raw_data = test_raw_data[test_raw_data['movieId'].isin(unique_sid)]\n",
    "test_raw_data = test_raw_data[test_raw_data['userId'].isin(unique_uid)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6993860 999003 207161\n"
     ]
    }
   ],
   "source": [
    "print len(train_raw_data), len(vad_raw_data), len(test_raw_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Basic data information: what's the timespan for train/test?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train: from 1995-01-09 06:46:44 to 2009-10-19 06:51:15\n",
      "test: from 2009-10-19 06:51:53 to 2015-03-31 02:11:28\n"
     ]
    }
   ],
   "source": [
    "train_timestamp = np.asarray(tr_vd_raw_data['timestamp'])\n",
    "print(\"train: from %s to %s\" % (timestamp_to_date(train_timestamp[0]), \n",
    "                                timestamp_to_date(train_timestamp[-1])))\n",
    "\n",
    "test_timestamp = np.asarray(test_raw_data['timestamp'])\n",
    "print(\"test: from %s to %s\" % (timestamp_to_date(test_timestamp[0]), \n",
    "                               timestamp_to_date(test_timestamp[-1])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Numerize the data into (timestamp, user_index, item_index) format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def numerize(tp):\n",
    "    uid = map(lambda x: user2id[x], tp['userId'])\n",
    "    sid = map(lambda x: song2id[x], tp['movieId'])\n",
    "    tp['uid'] = uid\n",
    "    tp['sid'] = sid\n",
    "    return tp[['timestamp', 'uid', 'sid']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "train_data = numerize(train_raw_data)\n",
    "train_data.to_csv(os.path.join(DATA_DIR, 'pro', 'train.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vad_data = numerize(vad_raw_data)\n",
    "vad_data.to_csv(os.path.join(DATA_DIR, 'pro', 'validation.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_data = numerize(test_raw_data)\n",
    "test_data.to_csv(os.path.join(DATA_DIR, 'pro', 'test.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
